{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 导入相应的包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "#皮尔逊相关系数\n",
    "from scipy.stats import pearsonr\n",
    "#标准化\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "#数据划分\n",
    "from sklearn.model_selection import train_test_split\n",
    "#KNN\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "#交叉检验\n",
    "from sklearn.model_selection import cross_val_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 获取数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data=pd.read_csv(\"./winequality-white.csv\",sep=\";\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 比较各特征间的皮尔逊相关系数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fixed acidity和volatile acidity之间的皮尔逊相关系数为-0.022697\n",
      "fixed acidity和citric acid之间的皮尔逊相关系数为0.289181\n",
      "fixed acidity和residual sugar之间的皮尔逊相关系数为0.089021\n",
      "fixed acidity和chlorides之间的皮尔逊相关系数为0.023086\n",
      "fixed acidity和free sulfur dioxide之间的皮尔逊相关系数为-0.049396\n",
      "fixed acidity和total sulfur dioxide之间的皮尔逊相关系数为0.091070\n",
      "fixed acidity和density之间的皮尔逊相关系数为0.265331\n",
      "fixed acidity和pH之间的皮尔逊相关系数为-0.425858\n",
      "fixed acidity和sulphates之间的皮尔逊相关系数为-0.017143\n",
      "fixed acidity和alcohol之间的皮尔逊相关系数为-0.120881\n",
      "fixed acidity和quality之间的皮尔逊相关系数为-0.113663\n",
      "volatile acidity和citric acid之间的皮尔逊相关系数为-0.149472\n",
      "volatile acidity和residual sugar之间的皮尔逊相关系数为0.064286\n",
      "volatile acidity和chlorides之间的皮尔逊相关系数为0.070512\n",
      "volatile acidity和free sulfur dioxide之间的皮尔逊相关系数为-0.097012\n",
      "volatile acidity和total sulfur dioxide之间的皮尔逊相关系数为0.089261\n",
      "volatile acidity和density之间的皮尔逊相关系数为0.027114\n",
      "volatile acidity和pH之间的皮尔逊相关系数为-0.031915\n",
      "volatile acidity和sulphates之间的皮尔逊相关系数为-0.035728\n",
      "volatile acidity和alcohol之间的皮尔逊相关系数为0.067718\n",
      "volatile acidity和quality之间的皮尔逊相关系数为-0.194723\n",
      "citric acid和residual sugar之间的皮尔逊相关系数为0.094212\n",
      "citric acid和chlorides之间的皮尔逊相关系数为0.114364\n",
      "citric acid和free sulfur dioxide之间的皮尔逊相关系数为0.094077\n",
      "citric acid和total sulfur dioxide之间的皮尔逊相关系数为0.121131\n",
      "citric acid和density之间的皮尔逊相关系数为0.149503\n",
      "citric acid和pH之间的皮尔逊相关系数为-0.163748\n",
      "citric acid和sulphates之间的皮尔逊相关系数为0.062331\n",
      "citric acid和alcohol之间的皮尔逊相关系数为-0.075729\n",
      "citric acid和quality之间的皮尔逊相关系数为-0.009209\n",
      "residual sugar和chlorides之间的皮尔逊相关系数为0.088685\n",
      "residual sugar和free sulfur dioxide之间的皮尔逊相关系数为0.299098\n",
      "residual sugar和total sulfur dioxide之间的皮尔逊相关系数为0.401439\n",
      "residual sugar和density之间的皮尔逊相关系数为0.838966\n",
      "residual sugar和pH之间的皮尔逊相关系数为-0.194133\n",
      "residual sugar和sulphates之间的皮尔逊相关系数为-0.026664\n",
      "residual sugar和alcohol之间的皮尔逊相关系数为-0.450631\n",
      "residual sugar和quality之间的皮尔逊相关系数为-0.097577\n",
      "chlorides和free sulfur dioxide之间的皮尔逊相关系数为0.101392\n",
      "chlorides和total sulfur dioxide之间的皮尔逊相关系数为0.198910\n",
      "chlorides和density之间的皮尔逊相关系数为0.257211\n",
      "chlorides和pH之间的皮尔逊相关系数为-0.090439\n",
      "chlorides和sulphates之间的皮尔逊相关系数为0.016763\n",
      "chlorides和alcohol之间的皮尔逊相关系数为-0.360189\n",
      "chlorides和quality之间的皮尔逊相关系数为-0.209934\n",
      "free sulfur dioxide和total sulfur dioxide之间的皮尔逊相关系数为0.615501\n",
      "free sulfur dioxide和density之间的皮尔逊相关系数为0.294210\n",
      "free sulfur dioxide和pH之间的皮尔逊相关系数为-0.000618\n",
      "free sulfur dioxide和sulphates之间的皮尔逊相关系数为0.059217\n",
      "free sulfur dioxide和alcohol之间的皮尔逊相关系数为-0.250104\n",
      "free sulfur dioxide和quality之间的皮尔逊相关系数为0.008158\n",
      "total sulfur dioxide和density之间的皮尔逊相关系数为0.529881\n",
      "total sulfur dioxide和pH之间的皮尔逊相关系数为0.002321\n",
      "total sulfur dioxide和sulphates之间的皮尔逊相关系数为0.134562\n",
      "total sulfur dioxide和alcohol之间的皮尔逊相关系数为-0.448892\n",
      "total sulfur dioxide和quality之间的皮尔逊相关系数为-0.174737\n",
      "density和pH之间的皮尔逊相关系数为-0.093591\n",
      "density和sulphates之间的皮尔逊相关系数为0.074493\n",
      "density和alcohol之间的皮尔逊相关系数为-0.780138\n",
      "density和quality之间的皮尔逊相关系数为-0.307123\n",
      "pH和sulphates之间的皮尔逊相关系数为0.155951\n",
      "pH和alcohol之间的皮尔逊相关系数为0.121432\n",
      "pH和quality之间的皮尔逊相关系数为0.099427\n",
      "sulphates和alcohol之间的皮尔逊相关系数为-0.017433\n",
      "sulphates和quality之间的皮尔逊相关系数为0.053678\n",
      "alcohol和quality之间的皮尔逊相关系数为0.435575\n"
     ]
    }
   ],
   "source": [
    "data_title = data.columns.values.tolist()\n",
    "for i in range(len(data_title)):\n",
    "    for j in range(i+1,len(data_title)):\n",
    "        print('%s和%s之间的皮尔逊相关系数为%f'%(data_title[i],data_title[j],pearsonr(data[data_title[i]],data[data_title[j]])[0]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 给白酒质量分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def Quality(e):\n",
    "    if e>0 and e<=4:\n",
    "        return 'bad'\n",
    "    elif e>4 and e<=7:\n",
    "        return 'ordinary'\n",
    "    else:\n",
    "        return 'good'\n",
    "data['Quality'] = data['quality'].map(Quality)\n",
    "data.drop(['quality'],axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 去掉皮尔逊相关系数高的特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 去掉“free sulfur dioxide”、“density”，“pH”\n",
    "data2=data[[\"fixed acidity\",\"volatile acidity\",\"citric acid\",\"residual sugar\",\"chlorides\",\"free sulfur dioxide\",\"sulphates\",\"alcohol\"]]\n",
    "data3=data.Quality"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 机器学习"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 划分数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train,x_test,y_train,y_test = train_test_split(data2,data3,test_size =0.2,random_state = 6)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 五折交叉验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "平均准确率 0.9105768068207876\n",
      "每则准确率 [0.92040816 0.89081633 0.91530612 0.91215526 0.91419816]\n"
     ]
    }
   ],
   "source": [
    "#使用KNN算法模型\n",
    "model = KNeighborsClassifier()\n",
    "\n",
    "score=cross_val_score(model,data2,data3,cv=5,scoring='accuracy')\n",
    "\n",
    "print ('平均准确率',score.mean())\n",
    "print ('每则准确率',score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "361px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
